/*! \file
Copyright (c) 2003, The Regents of the University of California, through
Lawrence Berkeley National Laboratory (subject to receipt of any required 
approvals from U.S. Dept. of Energy) 

All rights reserved. 

The source code is distributed under BSD license, see the file License.txt
at the top-level directory.
*/
/*! @file
 * \brief Memory utilities
 *
 * <pre>
 * -- Distributed SuperLU routine (version 5.2) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * September 1, 1999
 * 
 * Modified:
 *   September 30, 2017, add aligned malloc for Intel
 * </pre>
 */

#include "superlu_ddefs.h"

/*
 * Global variables
 */
SuperLU_ExpHeader *expanders; /* Array of pointers to 4 types of memory */
SuperLU_LU_stack_t stack;
int_t no_expand;


/*
 * Prototype
 */
static int_t memory_usage(const int_t, const int_t, const int_t);
static void *expand(int_t *, MemType, int_t, int_t,
		    Glu_freeable_t *);

/*
 * Internal prototypes
 */
void  SetupSpace (void *, int_t, LU_space_t *);


void
superlu_abort_and_exit_dist(char *msg)
{
    /*fprintf(stderr, msg);
    fflush(stderr);*/
    printf("%s", msg);
    exit (-1);
}

long int superlu_malloc_total = 0;

#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */

#define PAD_FACTOR  2
#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */

void *superlu_malloc_dist(size_t size)
{
    char *buf;
    int iam;

    MPI_Comm_rank(MPI_COMM_WORLD, &iam);
    if ( size <= 0 ) {
	printf("(%d) superlu_malloc size %lld\n", iam, size);
	ABORT("superlu_malloc: nonpositive size");
    }
    buf = (char *) malloc(size + DWORD);
    if ( !buf ) {
	printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lld\n",
	       iam, superlu_malloc_total*1e-6, size);
	ABORT("superlu_malloc: out of memory");
    }

    ((size_t *) buf)[0] = size;
#if 0
    superlu_malloc_total += size + DWORD;
#else
    superlu_malloc_total += size;
#endif
    return (void *) (buf + DWORD);
}

void superlu_free_dist(void *addr)
{
    char *p = ((char *) addr) - DWORD;

    if ( !addr )
	ABORT("superlu_free: tried to free NULL pointer");

    if ( !p )
	ABORT("superlu_free: tried to free NULL+DWORD pointer");

    { 
	int_t n = ((size_t *) p)[0];
	//printf("superlu_free-dist: n %d\n", n);
	
	if ( n==0 ) {
	    ABORT("superlu_free: tried to free a freed pointer");
	}
	*((size_t *) p) = 0; /* Set to zero to detect duplicate free's. */
#if 0	
	superlu_malloc_total -= (n + DWORD);
#else
	superlu_malloc_total -= n;
#endif

	if ( superlu_malloc_total < 0 )
	    ABORT("superlu_malloc_total went negative");
	
	/*free (addr);*/
	free (p);
    }

}
 
#else  /* The production mode. */

//#if  0 
#if (__STDC_VERSION__ >= 201112L)

void * superlu_malloc_dist(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);}
void   superlu_free_dist(void * ptr)    {free(ptr);}

#elif defined (__INTEL_COMPILER)
#include <immintrin.h>
void * superlu_malloc_dist(size_t size) {
    void* ptr;
    int alignment = 1<<12; // align at 4K page
    if (size > 1<<19 ) { alignment=1<<21; }
    return (_mm_malloc(size, alignment));
}
void  superlu_free_dist(void * ptr)  { _mm_free(ptr); }

#else // normal malloc/free 

void *superlu_malloc_dist(size_t size) {
    void *buf;
    buf = (void *) malloc(size);
    return (buf);
}
void superlu_free_dist(void *addr) { free (addr); }

#endif

#endif  /* End debug malloc/free. */



static void
copy_mem_int(int_t howmany, void *old, void *new)
{
    register int_t i;
    int_t *iold = old;
    int_t *inew = new;
    for (i = 0; i < howmany; i++) inew[i] = iold[i];
}


static void
user_bcopy(char *src, char *dest, int_t bytes)
{
    char *s_ptr, *d_ptr;

    s_ptr = src + bytes - 1;
    d_ptr = dest + bytes - 1;
    for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
}



int_t *intMalloc_dist(int_t n)
{
    int_t *buf;
    buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t));
    return (buf);
}

int_t *intCalloc_dist(int_t n)
{
    int_t *buf;
    register int_t i;
    buf = (int_t *) SUPERLU_MALLOC((size_t) SUPERLU_MAX(1,n) * sizeof(int_t));
    if ( buf )
	for (i = 0; i < n; ++i) buf[i] = 0;
    return (buf);
}


void *user_malloc_dist(int_t bytes, int_t which_end)
{
    void *buf;
    
    if ( SuperLU_StackFull(bytes) ) return (NULL);

    if ( which_end == HEAD ) {
	buf = (char*) stack.array + stack.top1;
	stack.top1 += bytes;
    } else {
	stack.top2 -= bytes;
	buf = (char*) stack.array + stack.top2;
    }
    
    stack.used += bytes;
    return buf;
}

void user_free_dist(int_t bytes, int_t which_end)
{
    if ( which_end == HEAD ) {
	stack.top1 -= bytes;
    } else {
	stack.top2 += bytes;
    }
    stack.used -= bytes;
}


/*! \brief
 *
 * <pre>
 * Setup the memory model to be used for factorization.
 *    lwork = 0: use system malloc;
 *    lwork > 0: use user-supplied work[] space.
 * </pre>
 */
void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel)
{
    if ( lwork == 0 ) {
	*MemModel = SYSTEM; /* malloc/free */
    } else if ( lwork > 0 ) {
	*MemModel = USER;   /* user provided space */
	stack.used = 0;
	stack.top1 = 0;
	stack.top2 = (lwork/4)*4; /* must be word addressable */
	stack.size = stack.top2;
	stack.array = (void *) work;
    }
}


/************************************************************************/
/*! \brief
 *
 * <pre>
 * Allocate storage for the data structures common to symbolic factorization
 * routines. For those unpredictable size, make a guess as FILL * nnz(A).
 * Return value:
 *     If lwork = -1, return the estimated amount of space required, plus n;
 *     otherwise, return the amount of space actually allocated when
 *     memory allocation failure occurred.
 * </pre>
 */

int_t symbfact_SubInit
/************************************************************************/
(
 fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz,
 Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable
 )
{
    int_t  iword;
    int_t  *xsup, *supno;
    int_t  *lsub, *xlsub;
    int_t  *usub, *xusub;
    int_t  nzlmax, nzumax;
    int_t  FILL = sp_ienv_dist(6);
    int iam;

#if ( DEBUGlevel>=1 )
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    CHECK_MALLOC(iam, "Enter symbfact_SubInit()");
#endif

    no_expand = 0;
    iword     = sizeof(int_t);

    expanders = (SuperLU_ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(SuperLU_ExpHeader) );
    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
    
    if ( fact == DOFACT || fact == SamePattern ) {
	/* Guess for L\U factors */
	nzlmax = FILL * annz;
	nzumax = FILL/2.0 * annz;

	if ( lwork == -1 ) {
	    return ( SuperLU_GluIntArray(n) * iword + SuperLU_TempSpace(m,1)
		    + (nzlmax+nzumax)*iword + n );
        } else {
	    SetupSpace(work, lwork, &Glu_freeable->MemModel);
	}
	
#if ( PRNTlevel>=2 )
	printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", 
		annz, nzlmax, nzumax);
#endif	
	
	/* Integer pointers for L\U factors */
	if ( Glu_freeable->MemModel == SYSTEM ) {
	    xsup   = intMalloc_dist(n+1);
	    supno  = intMalloc_dist(n+1);
	    xlsub  = intMalloc_dist(n+1);
	    xusub  = intMalloc_dist(n+1);
	} else {
	    xsup   = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
	    supno  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
	    xlsub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
	    xusub  = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
	}

	lsub  = (int_t *) expand(&nzlmax, (MemType) LSUB, 0, 0, Glu_freeable);
	usub  = (int_t *) expand(&nzumax, (MemType) USUB, 0, 0, Glu_freeable);

	while ( !lsub || !usub ) {
	    if ( Glu_freeable->MemModel == SYSTEM ) {
		SUPERLU_FREE(lsub); 
		SUPERLU_FREE(usub);
	    } else {
		user_free_dist((nzlmax+nzumax)*iword, HEAD);
	    }
	    nzlmax /= 2;
	    nzumax /= 2;
	    if ( nzumax < annz/2 ) {
		printf("Not enough memory to perform factorization.\n");
		return (memory_usage(nzlmax, nzumax, n) + n);
	    }
#if ( PRNTlevel>=1 )
	    printf("(%d).. symbfact_SubInit() reduce size:"
		   "nzlmax %lld, nzumax %lld\n", iam, (long long) nzlmax, (long long) nzumax);
	    fflush(stdout);
#endif
	    lsub  = (int_t *) expand( &nzlmax, (MemType) LSUB, 0, 0, Glu_freeable );
	    usub  = (int_t *) expand( &nzumax, (MemType) USUB, 0, 1, Glu_freeable );
	}

	Glu_persist->xsup    = xsup;
	Glu_persist->supno   = supno;
	Glu_freeable->lsub   = lsub;
	Glu_freeable->xlsub  = xlsub;
	Glu_freeable->usub   = usub;
	Glu_freeable->xusub  = xusub;
	Glu_freeable->nzlmax = nzlmax;
	Glu_freeable->nzumax = nzumax;
    } else {
	/* fact == SamePattern_SameRowPerm */
	if ( lwork == -1 ) {
	    return ( SuperLU_GluIntArray(n) * iword + SuperLU_TempSpace(m, 1)
		    + (nzlmax+nzumax)*iword + n );
        } else if ( lwork == 0 ) {
	    Glu_freeable->MemModel = SYSTEM;
	} else {
	    Glu_freeable->MemModel = USER;
	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
	    stack.size = stack.top2;
	}
	
	expanders[USUB].mem = Glu_freeable->usub;
	expanders[LSUB].mem = Glu_freeable->lsub;
	expanders[USUB].size = nzumax;
	expanders[LSUB].size = nzlmax;
    }

    ++no_expand;

#if ( DEBUGlevel>=1 )
    /* Memory allocated but not freed: xsup, supno */
    CHECK_MALLOC(iam, "Exit symbfact_SubInit()");
#endif

    return 0;
    
} /* SYMBFACT_SUBINIT */

/************************************************************************/
/*! \brief
 *
 * <pre>
 * Expand the data structures for L and U during the factorization.
 * Return value:   0 - successful return
 *               > 0 - number of bytes allocated when run out of space
 * </pre>
 */

int_t symbfact_SubXpand
/************************************************************************/
(
 int_t n,           /* total number of columns */
 int_t jcol,        /* current column */
 int_t next,        /* number of elements currently in the factors */
 MemType mem_type,  /* which type of memory to expand  */
 int_t *maxlen,     /* modified - maximum length of a data structure */
 Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
 )
{
    void   *new_mem;
    
#if ( DEBUGlevel>=1 )
    printf("symbfact_SubXpand(): jcol " IFMT ", next " IFMT ", maxlen " IFMT
	   ", MemType " IFMT "\n",
	   jcol, next, *maxlen, mem_type);
#endif    

    new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
    
    if ( !new_mem ) {
	int_t    nzlmax  = Glu_freeable->nzlmax;
	int_t    nzumax  = Glu_freeable->nzumax;
    	fprintf(stderr, "Can't expand MemType %d: jcol " IFMT "\n", mem_type, jcol);
    	return (memory_usage(nzlmax, nzumax, n) + n);
    }

    if ( mem_type == LSUB ) {
	Glu_freeable->lsub   = (int_t *) new_mem;
	Glu_freeable->nzlmax = *maxlen;
    } else if ( mem_type == USUB ) {
	Glu_freeable->usub   = (int_t *) new_mem;
	Glu_freeable->nzumax = *maxlen;
    } else ABORT("Tries to expand nonexisting memory type.\n");
    
    return 0;
    
} /* LUSUB_XPAND */

/************************************************************************/
/*! \brief
 *
 * <pre>
 * Deallocate storage of the data structures common to symbolic
 * factorization routines.
 * </pre>
 */

int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable)
/************************************************************************/
{
#if ( DEBUGlevel>=1 )
    int iam;
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    CHECK_MALLOC(iam, "Enter symbfact_SubFree()");
#endif
    
    SUPERLU_FREE(expanders);
    SUPERLU_FREE(Glu_freeable->lsub);
    SUPERLU_FREE(Glu_freeable->xlsub);
    SUPERLU_FREE(Glu_freeable->usub);
    SUPERLU_FREE(Glu_freeable->xusub);

#if ( DEBUGlevel>=1 )    
    CHECK_MALLOC(iam, "Exit symbfact_SubFree()");
#endif
    return 0;
} /* SYMBFACT_SUBFREE */

/************************************************************************/
/*! \brief
 *
 * <pre>
 * Expand the existing storage to accommodate more fill-ins.
 * </pre>
 */

static void *expand
/************************************************************************/
(
 int_t *prev_len,   /* length used from previous call */
 MemType type,    /* which part of the memory to expand */
 int_t len_to_copy, /* size of the memory to be copied to new store */
 int_t keep_prev,   /* = 1: use prev_len;
		     = 0: compute new_len to expand */
 Glu_freeable_t *Glu_freeable  /* modified - global LU data structures */
 )
{
    float    EXPAND = 1.5;
    float    alpha;
    void     *new_mem;
    int_t    new_len, tries, lword, extra, bytes_to_copy;

    alpha = EXPAND;
    lword = sizeof(int_t);

    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
        new_len = *prev_len;
    else {
	new_len = alpha * *prev_len;
    }

    if ( Glu_freeable->MemModel == SYSTEM ) {
	new_mem = (void *) SUPERLU_MALLOC((size_t) new_len * lword);
	/*new_mem = (void *) calloc(new_len, lword); */
	if ( no_expand != 0 ) {
	    tries = 0;
	    if ( keep_prev ) {
		if ( !new_mem ) return (NULL);
	    } else {
		while ( !new_mem ) {
		    if ( ++tries > 10 ) return (NULL);
		    alpha = SuperLU_Reduce(alpha);
		    new_len = alpha * *prev_len;
		    new_mem = (void*) SUPERLU_MALLOC((size_t)new_len * lword); 
		    /* new_mem = (void *) calloc(new_len, lword); */
		}
	    }
	    copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
	    SUPERLU_FREE (expanders[type].mem);
	}
	expanders[type].mem = (void *) new_mem;
	
    } else { /* MemModel == USER */
	if ( no_expand == 0 ) {
	    new_mem = user_malloc_dist((size_t)new_len * lword, HEAD);
	    expanders[type].mem = (void *) new_mem;
	}
	else {
	    tries = 0;
	    extra = (new_len - *prev_len) * lword;
	    if ( keep_prev ) {
		if ( SuperLU_StackFull(extra) ) return (NULL);
	    } else {
		while ( SuperLU_StackFull(extra) ) {
		    if ( ++tries > 10 ) return (NULL);
		    alpha = SuperLU_Reduce(alpha);
		    new_len = alpha * *prev_len;
		    extra = (new_len - *prev_len) * lword;	    
		}
	    }

	    if ( type != USUB ) {
		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
		bytes_to_copy = (char*)stack.array + stack.top1
		    - (char*)expanders[type + 1].mem;
		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);

		if ( type < USUB ) {
		    Glu_freeable->usub = expanders[USUB].mem =
			(void*)((char*)expanders[USUB].mem + extra);
		}
		if ( type < LSUB ) {
		    Glu_freeable->lsub = expanders[LSUB].mem =
			(void*)((char*)expanders[LSUB].mem + extra);
		}
		stack.top1 += extra;
		stack.used += extra;
		
	    } /* if ... */

	} /* else ... */
    }

    expanders[type].size = new_len;
    *prev_len = new_len;
    if ( no_expand ) ++no_expand;
    
    return (void *) expanders[type].mem;
    
} /* EXPAND */

/************************************************************************/
/*! \brief
 *
 * <pre>
 * mem_usage consists of the following fields:
 *    - for_lu (float)
 *      The amount of space used in bytes for the L\U data structures.
 *    - total (float)
 *      The amount of space needed in bytes to perform factorization.
 *    - expansions (int)
 *      Number of memory expansions during the LU factorization.
 * </pre>
 */

int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable,
		      superlu_dist_mem_usage_t *mem_usage)
/************************************************************************/
{
    register int_t iword = sizeof(int_t);
    extern int_t no_expand;

    /* For the adjacency graphs of L and U. */
    /*mem_usage->for_lu = (float)( (4*n + 3) * iword +
				Glu_freeable->xlsub[n]*iword );*/
    mem_usage->for_lu = (float)( (4*n + 3) * iword +
				lsub_size * iword );
    mem_usage->for_lu += (float)( (n + 1) * iword +
				 Glu_freeable->xusub[n]*iword );

    /* Working storage to support factorization */
    mem_usage->total = mem_usage->for_lu + 9*n*iword;

    mem_usage->expansions = --no_expand;
    return 0;
} /* QUERYSPACE_DIST */

static int_t
memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n)
{
    register int_t iword = sizeof(int_t);
    return (10*n*iword + (nzlmax+nzumax)*iword);
}

